In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
In [2]:
data = pd.read_csv("scholarship.csv")
In [3]:
data.head()
Out[3]:
semester_percentage | scholarship_exam_marks | got_scholarship | |
---|---|---|---|
0 | 71.9 | 26 | 1 |
1 | 74.6 | 38 | 1 |
2 | 75.4 | 40 | 1 |
3 | 64.2 | 8 | 1 |
4 | 72.3 | 17 | 0 |
In [4]:
sns.distplot(data["scholarship_exam_marks"])
C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\828206924.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data["scholarship_exam_marks"])
Out[4]:
<Axes: xlabel='scholarship_exam_marks', ylabel='Density'>
In [5]:
data["scholarship_exam_marks"].skew()
Out[5]:
0.8356419499466834
In [14]:
data["scholarship_exam_marks"].describe()
Out[14]:
count 1000.000000 mean 32.225000 std 19.130822 min 0.000000 25% 17.000000 50% 28.000000 75% 44.000000 max 100.000000 Name: scholarship_exam_marks, dtype: float64
In [7]:
sns.boxplot(data["scholarship_exam_marks"])
Out[7]:
<Axes: ylabel='scholarship_exam_marks'>
In [15]:
Q1 = data["scholarship_exam_marks"].quantile(0.25)
Q3 = data["scholarship_exam_marks"].quantile(0.75)
In [16]:
IQR = Q3 - Q1
In [17]:
min = Q1 - 1.5*IQR
max = Q3 + 1.5*IQR
In [18]:
#trimming
new_data = data[(data["scholarship_exam_marks"]>min) & (data["scholarship_exam_marks"]<max)]
In [20]:
plt.figure(figsize=(16,8))
plt.subplot(2,2,1)
sns.distplot(data["scholarship_exam_marks"])
plt.subplot(2,2,2)
sns.boxplot(data["scholarship_exam_marks"])
plt.subplot(2,2,3)
sns.distplot(new_data["scholarship_exam_marks"])
plt.subplot(2,2,4)
sns.boxplot(new_data["scholarship_exam_marks"])
plt.show()
C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\2007280676.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data["scholarship_exam_marks"]) C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\2007280676.py:9: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(new_data["scholarship_exam_marks"])
In [ ]:
#capping
new_data2 = data.copy()
new_data2["scholarship_exam_marks"] = np.where(data["scholarship_exam_marks"]>max,
max,
np.where(data["scholarship_exam_marks"]<min,min, data["scholarship_exam_marks"]))
In [ ]:
new_data2.shape
Out[ ]:
(1000, 3)
In [ ]:
new_data2["scholarship_exam_marks"].describe()
Out[ ]:
count 1000.000000 mean 32.136500 std 18.865419 min 0.000000 25% 17.000000 50% 28.000000 75% 44.000000 max 84.500000 Name: scholarship_exam_marks, dtype: float64